# Nimish Bendre BU ID : U18700022
# CS 677 - Term Project - Likes analysis for Spotify
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import missingno as msno
import seaborn as sns
from prettytable import PrettyTable
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import plotly.express as px
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
# Ignore warning messages
warnings.filterwarnings("ignore")
# Load spotify dataset
df_sy = pd.read_csv("C:\\BU\\Spotify_Youtube.csv")
# Add the Review column to act as class attribute and set the values. <300K = 'Good', >300K and < 1M is "Great'
# and >1M is 'Awesome'
df_sy['Review'] = ['Good' if likes < 300000 else 'Great' if likes < 1000000 else 'Awesome' for likes in df_sy['Likes']]
corr_matrix = df_sy.corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()
# Interesting insights about the Spotify dataset
# Top 5 artists by likes and views
grouped_likes_views = df_sy.groupby('Artist').agg({'Likes': 'sum', 'Views': 'sum'})
# Sort the results by likes and views, and select the top 5 artists
top_artists_by_likes = grouped_likes_views.sort_values('Likes', ascending=False).head(5)
top_artists_by_views = grouped_likes_views.sort_values('Views', ascending=False).head(5)
# Plot the top 5 artists by likes and views using a bar chart
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
# Plot the top 5 artists by likes
axs[0].bar(top_artists_by_likes.index, top_artists_by_likes['Likes'], color='#19ACCE')
axs[0].set_title('Top 5 Artists by Likes')
axs[0].set_xlabel('Artist')
axs[0].set_ylabel('Total Likes')
# Plot the top 5 artists by views
axs[1].bar(top_artists_by_views.index, top_artists_by_views['Views'], color='#146C94')
axs[1].set_title('Top 5 Artists by Views')
axs[1].set_xlabel('Artist')
axs[1].set_ylabel('Total Views')
plt.tight_layout()
plt.show()
# Top 5 Albums by likes and views
grouped_likes_views = df_sy.groupby('Album').agg({'Likes': 'sum', 'Views': 'sum'})
# Sort the results by likes and views, and select the top 5 albums
top_albums_by_likes = grouped_likes_views.sort_values('Likes', ascending=False).head(5)
top_albums_by_views = grouped_likes_views.sort_values('Views', ascending=False).head(5)
# Plot the top 5 albums by likes and views using a bar chart
fig, axs = plt.subplots(1, figsize=(10, 5))
# Plot the top 5 albums by likes
axs.bar(top_albums_by_likes.index, top_albums_by_likes['Likes'], color='#643A6B')
# plt.xticks(top_albums_by_likes.index, rotation=0, ha='center', wrap=True)
axs.set_title('Top 5 Albums by Likes')
axs.set_xlabel('Album')
axs.set_ylabel('Total Likes')
plt.show()
# Plot the top 5 albums by views
fig, axs1 = plt.subplots(1, figsize=(10, 5))
axs1.bar(top_albums_by_views.index, top_albums_by_views['Views'], color='#917FB3')
# plt.xticks(top_albums_by_views.index, rotation=0, ha='center', wrap=True)
axs1.set_title('Top 5 Albums by Views')
axs1.set_xlabel('Album')
axs1.set_ylabel('Total Views')
plt.tight_layout()
plt.show()
# Top 5 Danceable Albums
# Group the data by album and calculate the mean danceability score for each album
grouped = df_sy.groupby('Album').agg({'Danceability': 'mean'})
# Sort the results by danceability score, and select the top 5 albums
top_albums = grouped.sort_values('Danceability', ascending=False).head(5)
# Plot bar chart of the top 5 danceable albums
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_xlim(0.9, 1)
plt.yticks(wrap=True)
ax.barh(top_albums.index, top_albums['Danceability'], color='#5F264A')
ax.set_title('Top 5 Danceable Albums')
ax.set_xlabel('Danceability')
ax.set_ylabel('Album')
plt.show()
# Count of official and unofficial Youtube videos
official_video_counts = df_sy['official_video'].value_counts()
# Create a bar plot of official video counts
fig, ax = plt.subplots()
official_video_counts.plot(kind='bar', ax=ax, color=['#1f77b4', '#ff7f0e'])
ax.set_title('Official Videos')
ax.set_xlabel('Official Video')
ax.set_ylabel('Count')
ax.set_xticklabels(['No', 'Yes'], rotation=0)
plt.show()
# Select the columns you want to calculate the correlation with
feature_columns = ['Danceability', 'Loudness', 'Speechiness', 'Duration_ms', 'Channel', 'Energy']
# Calculate the correlation matrix
corr_matrix = df_sy[feature_columns + ['Review']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.show()
# Data cleaning
# Removing unwanted columns from the dataset
df_sy.drop(['Url_spotify', 'Uri', 'Url_youtube', 'Title', 'Views', 'Likes', 'Comments', 'Description', 'Stream'],
axis=1, inplace=True)
# Removing the unnamed column from dataset
df_sy = df_sy.drop(df_sy.columns[df_sy.columns.str.contains('Unnamed', case=False)], axis=1)
# Removing the rows with NaN values
df_sy = df_sy.dropna()
# Checking for missing values in dataset
msno.bar(df_sy)
plt.show()
# To enumerate labels into values for enumerated columns in the dataset
# Could not optimize this code due to time constraint
le = LabelEncoder()
df_sy['Artist'] = le.fit_transform(df_sy.Artist.values)
df_sy['Track'] = le.fit_transform(df_sy.Track.values)
df_sy['Album'] = le.fit_transform(df_sy.Album.values)
df_sy['Album_type'] = le.fit_transform(df_sy.Album_type.values)
df_sy['Danceability'] = le.fit_transform(df_sy.Danceability.values)
df_sy['Energy'] = le.fit_transform(df_sy.Energy.values)
df_sy['Key'] = le.fit_transform(df_sy.Key.values)
df_sy['Loudness'] = le.fit_transform(df_sy.Loudness.values)
df_sy['Speechiness'] = le.fit_transform(df_sy.Speechiness.values)
df_sy['Acousticness'] = le.fit_transform(df_sy.Acousticness.values)
df_sy['Instrumentalness'] = le.fit_transform(df_sy.Instrumentalness.values)
df_sy['Liveness'] = le.fit_transform(df_sy.Liveness.values)
df_sy['Valence'] = le.fit_transform(df_sy.Valence.values)
df_sy['Tempo'] = le.fit_transform(df_sy.Tempo.values)
df_sy['Duration_ms'] = le.fit_transform(df_sy.Duration_ms.values)
df_sy['Channel'] = le.fit_transform(df_sy.Channel.values)
df_sy['Licensed'] = le.fit_transform(df_sy.Licensed.values)
df_sy['official_video'] = le.fit_transform(df_sy.official_video.values)
df_sy['Review'] = le.fit_transform(df_sy.Review.values)
# -------------------------------------------------------------------------------------------
# Get the feature dataframe
x_sy_data = df_sy.iloc[:, :-1]
# Get the class dataframe
y_sy_data = df_sy.iloc[:, -1]
# Feature selection algorithm - Extra trees classifier
clf = ExtraTreesClassifier(n_estimators=200, random_state=4)
clf = clf.fit(x_sy_data, y_sy_data)
coef = pd.concat([pd.DataFrame(x_sy_data.columns), pd.DataFrame(np.transpose(clf.feature_importances_))], axis=1)
# print(coef)
coef.columns = ['Feature', 'Importance']
coef.sort_values(by='Importance', inplace=True)
fig = px.bar(coef, x="Feature", y="Importance", title='Feature Importance from Extra Trees classifier',
color='Importance')
fig.show()
# Analysis with the top importance features. Features with coefficient >= 0.6 are taken for analysis
model = SelectFromModel(clf, prefit=True)
feature_idx = model.get_support()
feature_name = x_sy_data.columns[feature_idx]
print('\n', len(feature_name), ' Features selected based on importance - Extra Trees Classifer :', feature_name)
x_sy_new_impfeature = df_sy[list(feature_name)]
14 Features selected based on importance - Extra Trees Classifer : Index(['Artist', 'Track', 'Album', 'Danceability', 'Energy', 'Key', 'Loudness',
'Speechiness', 'Acousticness', 'Liveness', 'Valence', 'Tempo',
'Duration_ms', 'Channel'],
dtype='object')
# Split the dataset into train and test dataset 60% train and 40% test
# Select class attribute for dataset
y_sy_new_class = df_sy.iloc[:, -1]
# Split the dataset into train and test
x_sy_train_new, x_sy_test_new, y_sy_train_new, y_sy_test_new = train_test_split(x_sy_new_impfeature,
y_sy_new_class, test_size=0.6)
# Using K-NN algorithm for prediction and accuracy calculation
# Building the model
K = []
training = []
test = []
scores = {}
cv_scores = []
k_range = range(3, 13, 2)
for k in k_range:
kn_clf = KNeighborsClassifier(n_neighbors=k)
kn_clf.fit(x_sy_train_new, y_sy_train_new)
y_pred = kn_clf.predict(x_sy_test_new)
training_score = kn_clf.score(x_sy_train_new, y_sy_train_new)
test_score = kn_clf.score(x_sy_test_new, y_sy_test_new)
K.append(k)
training.append(training_score)
test.append(test_score)
# scores[k] = [training_score, test_score]
scores = cross_val_score(kn_clf, x_sy_train_new, y_sy_train_new, cv=10, scoring='accuracy')
cv_scores.append(scores.mean())
# Find the best value of k
best_k = k_range[np.argmax(cv_scores)]
print('Best value of k:', best_k)
# Use optimum value of k to calculate performance measures
kn_clf = KNeighborsClassifier(n_neighbors=best_k)
kn_clf.fit(x_sy_train_new, y_sy_train_new)
y_pred_knn = kn_clf.predict(x_sy_test_new)
accuracy_knn = accuracy_score(y_sy_test_new, y_pred_knn) * 100
print("\nAccuracy with K-NN classifier is : ", round(accuracy_knn, 3))
confusion_matrix = metrics.confusion_matrix(y_sy_test_new, y_pred_knn)
print("\nConfusion Matrix - K-NN\n", confusion_matrix)
Best value of k: 11 Accuracy with K-NN classifier is : 63.19 Confusion Matrix - K-NN [[ 240 1632 70] [ 378 7318 252] [ 159 1981 119]]
# Use Gaussian Support vector machine (SVM) Classifier for predictions
svc_clf = SVC(kernel='rbf')
# Train the model
svc_clf.fit(x_sy_train_new, y_sy_train_new)
# Predict the response from dataset
y_pred_svm = svc_clf.predict(x_sy_test_new)
# Calculate model accuracy
accuracy_g_svm = round(metrics.accuracy_score(y_sy_test_new, y_pred_svm) * 100, 2)
# Compute accuracy
print("\nAccuracy with Gaussian SVM classifier is : ", accuracy_g_svm)
confusion_matrix = metrics.confusion_matrix(y_sy_test_new, y_pred_svm)
print("\nConfusion Matrix - SVM Gaussian \n", confusion_matrix)
Accuracy with Gaussian SVM classifier is : 65.42 Confusion Matrix - SVM Gaussian [[ 0 1942 0] [ 0 7948 0] [ 0 2259 0]]
# Use polynomial Support vector machine (SVM) Classifier with degree 2
svc_p_clf = SVC(kernel='poly', degree=2)
# Train the model
svc_p_clf.fit(x_sy_train_new, y_sy_train_new)
# Predict the response from dataset
y_pred_p_svm = svc_p_clf.predict(x_sy_test_new)
# Calculate model accuracy
accuracy_p_svm_score = metrics.accuracy_score(y_sy_test_new, y_pred_p_svm) * 100
# Compute accuracy with polynomial SVM degree 2
print("\nAccuracy with polynomial SVM classifier is : ", round(accuracy_p_svm_score,2))
confusion_matrix = metrics.confusion_matrix(y_sy_test_new, y_pred_p_svm)
print("\nConfusion Matrix - SVM Polynomial\n", confusion_matrix)
# Use Linear Support vector machine (SVM) Classifier for predictions
Accuracy with polynomial SVM classifier is : 65.42 Confusion Matrix - SVM Polynomial [[ 0 1942 0] [ 0 7948 0] [ 0 2259 0]]
# Use Linear Support vector machine (SVM) Classifier for predictions
svc_lin_clf = LinearSVC()
# Train the model
svc_lin_clf.fit(x_sy_train_new, y_sy_train_new)
# Predict the response from dataset
y_pred_lin_svm = svc_lin_clf.predict(x_sy_test_new)
# Calculate model accuracy
accuracy_lin_svm_score = metrics.accuracy_score(y_sy_test_new, y_pred_lin_svm) * 100
# Compute accuracy
print("\nAccuracy with Linear SVM classifier is : ", round(accuracy_lin_svm_score, 2))
confusion_matrix = metrics.confusion_matrix(y_sy_test_new, y_pred_lin_svm)
print("\nConfusion Matrix - Linear SVM \n", confusion_matrix)
Accuracy with Linear SVM classifier is : 63.17 Confusion Matrix - Linear SVM [[ 25 1694 223] [ 45 7491 412] [ 19 2082 158]]
# Use Random Forest Classifier for n = 1 to 10 and depth = 1 to 5
accuracy_score_arr = []
n_tree = []
d_depth = []
df_result = pd.DataFrame()
n = 11
for i in range(1, n):
d = 6
for j in range(1, d):
rfc = RandomForestClassifier(n_estimators=i, max_depth=j)
rfc.fit(x_sy_train_new, y_sy_train_new)
# Prediction
y_prob_rfc = rfc.predict(x_sy_test_new)
# Accuracy calculation
accuracy_all_score = accuracy_score(y_sy_test_new, y_prob_rfc) * 100
n_tree.append(i)
d_depth.append(j)
accuracy_score_arr.append(round(accuracy_all_score, 3))
df_result['N value'] = n_tree
df_result["Depth"] = d_depth
df_result["Accuracy Score"] = accuracy_score_arr
# print(df_result)
# Find the n and depth for maximum accuracy
accuracy_score_arr = df_result.max()
accuracy_rf = round(accuracy_score_arr['Accuracy Score'], 2)
print("\n Random Forest Classifier - Maximum accuracy is obtained for:\n", accuracy_score_arr)
confusion_matrix = metrics.confusion_matrix(y_sy_test_new, y_prob_rfc)
print("\n Confusion Matrix - Random Forest\n", confusion_matrix)
Random Forest Classifier - Maximum accuracy is obtained for: N value 10.000 Depth 5.000 Accuracy Score 65.421 dtype: float64 Confusion Matrix - Random Forest [[ 0 1942 0] [ 0 7948 0] [ 0 2259 0]]
# Use Decision Tree Classifier
dtc = DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=3, min_samples_leaf=5)
# Train the model
dtc.fit(x_sy_train_new, y_sy_train_new)
# Prediction
y_prob_dtc = dtc.predict(x_sy_test_new)
# Accuracy calculation
accuracy_dt = accuracy_score(y_sy_test_new, y_prob_dtc) * 100
# 1 - Compute accuracy for year 2
print("\nAccuracy with Decision Tree classifier is : ", round(accuracy_dt, 2))
confusion_matrix = metrics.confusion_matrix(y_sy_test_new, y_prob_dtc)
print("\nConfusion Matrix - Decision Tree classifier\n", confusion_matrix)
Accuracy with Decision Tree classifier is : 65.34 Confusion Matrix - Decision Tree classifier [[ 0 1940 2] [ 0 7936 12] [ 0 2257 2]]
# -------------AdaBoost------------------
# For λ = 0.5 and λ = 1, constructed an Adaboost classifier with two base estimators of logistic regression
# and Naive Bayesian.
# Initialize base estimators
lr_clf = LogisticRegression()
nb_clf = GaussianNB()
classifiers = [lr_clf, nb_clf]
lambdas = [0.5, 1]
N = list(range(1, 16))
for lam in lambdas:
for clf in classifiers:
clf_name = type(clf).__name__
error_rates = []
for n in N:
# Train Adaboost classifier with n weak learners and base estimator
ada_clf = AdaBoostClassifier(base_estimator=clf, n_estimators=n, learning_rate=lam)
ada_clf.fit(x_sy_train_new, y_sy_train_new)
# Predict on year 2 data and calculate error rate
y_pred_ada = ada_clf.predict(x_sy_test_new)
error_rate = 1 - accuracy_score(y_sy_test_new, y_pred_ada)
error_rates.append(error_rate)
# Plot the error rates
plt.plot(N, error_rates, label=clf_name)
plt.xlabel('N ')
plt.ylabel('Error rate')
plt.title('Adaboost error rate for ' + clf_name + ' with λ= ' + str(lam))
plt.legend()
# plt.show()
# Best value of N* for each base estimator for lambda = 0.5
best_Ns = {}
best_accuracy = {}
for base_estimator in classifiers:
error_rates = []
accuracy_list = []
# Iterate over the range of N values
for n in N:
ada_clf = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n, learning_rate=0.5)
ada_clf.fit(x_sy_train_new, y_sy_train_new)
y_pred_ada_clf = ada_clf.predict(x_sy_test_new)
accuracy_ada_clf = accuracy_score(y_sy_test_new, y_pred_ada_clf)
accuracy_list.append(accuracy_ada_clf)
error_rate_ada_clf = round(1 - accuracy_ada_clf, 2)
error_rates.append(error_rate_ada_clf)
# Find the index of the minimum error rate
best_N_idx = np.argmin(error_rates)
best_Ns[str(base_estimator)] = N[best_N_idx]
best_accuracy[str(base_estimator)] = accuracy_list[best_N_idx]
# Print the best N* for each base estimator
for estimator, best_N in best_Ns.items():
estimator = estimator.replace("(", " ").replace(")", " ")
# print('\nBest value of N* for',estimator,'is : ', best_N)
# print(best_accuracy)
accuracy_gnb_adab = round(best_accuracy['GaussianNB()'], 2) * 100
accuracy_lr_adab = round(best_accuracy['LogisticRegression()'], 2) * 100
# Accuracy for each base estimator
for estimator, best_acc in best_accuracy.items():
estimator = estimator.replace("(", " ").replace(")", " ")
print('\nAccuracy with AdaBoost classifier and ', estimator, 'is : ', round(best_acc, 2))
Accuracy with AdaBoost classifier and LogisticRegression is : 0.65 Accuracy with AdaBoost classifier and GaussianNB is : 0.66
# Logistic regression model
lrm = LogisticRegression()
lrm.fit(x_sy_train_new, y_sy_train_new)
# Predict
y_pred_lr = lrm.predict(x_sy_test_new)
# Logistic regression co-efficients
print("Logistic regression coefficients:")
print("b0 = ", lrm.intercept_)
print("b1 = ", lrm.coef_[0][0])
print("b2 = ", lrm.coef_[0][1])
# Accuracy and Confusion Matrix
accuracy_log_reg = round(accuracy_score(y_sy_test_new, y_pred_lr), 2) * 100
print("\nAccuracy with Logistic Regression classifier is :", accuracy_log_reg)
cm = metrics.confusion_matrix(y_sy_test_new, y_pred_lr)
print("\nConfusion Matrix - Logistic Regression classifier : \n", cm)
Logistic regression coefficients: b0 = [-1.20383600e-06 2.13776038e-06 -9.33924382e-07] b1 = -7.337901579735785e-05 b2 = -6.4643313801626185e-06 Accuracy with Logistic Regression classifier is : 65.0 Confusion Matrix - Logistic Regression classifier : [[ 88 1854 0] [ 91 7853 4] [ 59 2200 0]]
# Tabular demonstration of accuracy and classifiers
# Summarize the table
print('\nSummary Table with features by importance using \nExtra Trees Classifier with importance > 0.6')
columns = ['Classifier', 'Accuracy %']
hdr = ['K-NN', 'SVM-Gaussian', 'SVM-Polynomial', 'SVM-Linear', 'Random Forest', 'Decision Tree',
'AdaBoost with LogisticRegression', 'AdaBoost with Gaussian Naive Bayes', 'LogisticRegression']
c1 = [round(accuracy_knn, 2), accuracy_g_svm, round(accuracy_p_svm_score, 2), round(accuracy_lin_svm_score, 2)
, accuracy_rf, round(accuracy_dt, 2), accuracy_lr_adab, accuracy_gnb_adab, accuracy_log_reg]
class_Table = PrettyTable()
class_Table.add_column(columns[0], hdr)
class_Table.add_column(columns[1], c1)
print(class_Table)
Summary Table with features by importance using Extra Trees Classifier with importance > 0.6 +------------------------------------+------------+ | Classifier | Accuracy % | +------------------------------------+------------+ | K-NN | 63.19 | | SVM-Gaussian | 65.42 | | SVM-Polynomial | 65.42 | | SVM-Linear | 63.17 | | Random Forest | 65.42 | | Decision Tree | 65.34 | | AdaBoost with LogisticRegression | 65.0 | | AdaBoost with Gaussian Naive Bayes | 66.0 | | LogisticRegression | 65.0 | +------------------------------------+------------+
# ----------- Code Start - Dataset with Top 4 features by importance using Extra Trees Classifier --------------
# Take the top 4 features by importance for the analysis
x_sy_top4 = df_sy[['Loudness','Duration_ms','Danceability','Speechiness']]
# print(x_sy_top4.head(10))
# Split the dataset into train and test dataset 60% train and 40% test
# Select class attribute for dataset
y_sy_top4 = df_sy.iloc[:, -1]
# Split the dataset into train and test
x_sy_train_top4, x_sy_test_top4, y_sy_train_top4, y_sy_test_top4 = train_test_split(x_sy_top4,
y_sy_top4, test_size=0.6)
# Using K-NN algorithm for prediction and accuracy calculation
# Building the model
K = []
training = []
test = []
scores = {}
cv_scores = []
k_range = range(3,13,2)
for k in k_range:
kn_clf = KNeighborsClassifier(n_neighbors=k)
kn_clf.fit(x_sy_train_top4, y_sy_train_top4)
y_pred = kn_clf.predict(x_sy_test_top4)
training_score = kn_clf.score(x_sy_train_top4, y_sy_train_top4)
test_score = kn_clf.score(x_sy_test_top4, y_sy_test_top4)
K.append(k)
training.append(training_score)
test.append(test_score)
# scores[k] = [training_score, test_score]
scores = cross_val_score(kn_clf, x_sy_train_top4, y_sy_train_top4, cv=10, scoring='accuracy')
cv_scores.append(scores.mean())
# Find the best value of k
best_k = k_range[np.argmax(cv_scores)]
print('Best value of k:', best_k)
# Use optimum value of k to calculate performance measures
kn_clf = KNeighborsClassifier(n_neighbors=best_k)
kn_clf.fit(x_sy_train_top4, y_sy_train_top4)
y_pred_knn = kn_clf.predict(x_sy_test_top4)
accuracy_knn = round(accuracy_score(y_sy_test_top4,y_pred_knn) * 100,2)
print("\nAccuracy with K-NN classifier is : ", accuracy_knn)
confusion_matrix = metrics.confusion_matrix(y_sy_test_top4, y_pred_knn)
print("\nConfusion Matrix - K-NN\n",confusion_matrix)
Best value of k: 11 Accuracy with K-NN classifier is : 63.03 Confusion Matrix - K-NN [[ 204 1634 95] [ 405 7340 256] [ 153 1948 114]]
# Use Gaussian Support vector machine (SVM) Classifier for predictions
svc_clf = SVC(kernel='rbf')
# Train the model
svc_clf.fit(x_sy_train_top4, y_sy_train_top4)
# Predict the response from dataset
y_pred_svm = svc_clf.predict(x_sy_test_top4)
# Calculate model accuracy
accuracy_g_svm = round(metrics.accuracy_score(y_sy_test_top4, y_pred_svm) * 100,2)
# Compute accuracy
print("\nAccuracy with Gaussian SVM classifier is : ", accuracy_g_svm)
confusion_matrix = metrics.confusion_matrix(y_sy_test_top4, y_pred_svm)
print("\nConfusion Matrix - SVM Gaussian \n",confusion_matrix)
Accuracy with Gaussian SVM classifier is : 65.86 Confusion Matrix - SVM Gaussian [[ 0 1933 0] [ 0 8001 0] [ 0 2215 0]]
# Use polynomial Support vector machine (SVM) Classifier with degree 2
svc_p_clf = SVC(kernel='poly', degree=2)
# Train the model
svc_p_clf.fit(x_sy_train_top4, y_sy_train_top4)
# Predict the response from dataset
y_pred_p_svm = svc_p_clf.predict(x_sy_test_top4)
# Calculate model accuracy
accuracy_p_svm_score = metrics.accuracy_score(y_sy_test_top4, y_pred_p_svm) * 100
# Compute accuracy with polynomial SVM degree 2
print("\nAccuracy with polynomial SVM classifier is : ", round(accuracy_p_svm_score, 2))
confusion_matrix = metrics.confusion_matrix(y_sy_test_top4, y_pred_p_svm)
print("\n Confusion Matrix - SVM Polynomial\n",confusion_matrix)
Accuracy with polynomial SVM classifier is : 65.86 Confusion Matrix - SVM Polynomial [[ 0 1933 0] [ 0 8001 0] [ 0 2215 0]]
# Use Linear Support vector machine (SVM) Classifier for predictions
svc_lin_clf = LinearSVC()
# Train the model
svc_lin_clf.fit(x_sy_train_top4, y_sy_train_top4)
# Predict the response from dataset
y_pred_lin_svm = svc_lin_clf.predict(x_sy_test_top4)
# Calculate model accuracy
accuracy_lin_svm_score = metrics.accuracy_score(y_sy_test_top4, y_pred_lin_svm) * 100
# Compute accuracy
print("\nAccuracy with Linear SVM classifier is : ", round(accuracy_lin_svm_score, 2))
confusion_matrix = metrics.confusion_matrix(y_sy_test_top4, y_pred_lin_svm)
print("\nConfusion Matrix - Linear SVM \n",confusion_matrix)
Accuracy with Linear SVM classifier is : 46.33 Confusion Matrix - Linear SVM [[ 739 1169 25] [2827 4843 331] [ 907 1261 47]]
# Use Random Forest Classifier for n = 1 to 10 and depth = 1 to 5
accuracy_score_arr = []
n_tree = []
d_depth = []
df_result = pd.DataFrame()
n = 11
for i in range(1, n):
d = 6
for j in range(1, d):
rfc = RandomForestClassifier(n_estimators=i, max_depth=j)
rfc.fit(x_sy_train_top4, y_sy_train_top4)
# Prediction
y_prob_rfc = rfc.predict(x_sy_test_top4)
# Accuracy calculation
accuracy_all_score = accuracy_score(y_sy_test_top4, y_prob_rfc) * 100
n_tree.append(i)
d_depth.append(j)
accuracy_score_arr.append(round(accuracy_all_score, 3))
df_result['N value'] = n_tree
df_result["Depth"] = d_depth
df_result["Accuracy Score"] = accuracy_score_arr
# print(df_result)
# Find the n and depth for maximum accuracy
accuracy_score_arr = df_result.max()
accuracy_rf = round(accuracy_score_arr['Accuracy Score'],2)
print("\n Random Forest Classifier - Maximum accuracy is obtained for:\n", accuracy_score_arr)
confusion_matrix = metrics.confusion_matrix(y_sy_test_top4, y_prob_rfc)
print("\n Confusion Matrix - Random Forest\n",confusion_matrix)
Random Forest Classifier - Maximum accuracy is obtained for: N value 10.000 Depth 5.000 Accuracy Score 65.923 dtype: float64 Confusion Matrix - Random Forest [[ 0 1933 0] [ 0 8001 0] [ 0 2215 0]]
# Use Decision Tree Classifier
dtc = DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=3, min_samples_leaf=5)
# Train the model
dtc.fit(x_sy_train_top4, y_sy_train_top4)
# Prediction
y_prob_dtc = dtc.predict(x_sy_test_top4)
# Accuracy calculation
accuracy_dt = accuracy_score(y_sy_test_top4, y_prob_dtc) * 100
print("\nAccuracy with Decision Tree classifier is : ", round(accuracy_dt, 2))
confusion_matrix = metrics.confusion_matrix(y_sy_test_top4, y_prob_dtc)
print("\nConfusion Matrix - Decision Tree classifier\n",confusion_matrix)
Accuracy with Decision Tree classifier is : 65.86 Confusion Matrix - Decision Tree classifier [[ 0 1933 0] [ 0 8001 0] [ 0 2215 0]]
#-------------AdaBoost------------------
# For λ = 0.5 and λ = 1, constructed an Adaboost classifier with two base estimators of logistic regression
# and Naive Bayesian.
# Initialize base estimators
lr_clf = LogisticRegression()
nb_clf = GaussianNB()
classifiers = [lr_clf, nb_clf]
lambdas = [0.5, 1]
N = list(range(1, 16))
for lam in lambdas:
for clf in classifiers:
clf_name = type(clf).__name__
error_rates = []
for n in N:
# Train Adaboost classifier with n weak learners and base estimator
ada_clf = AdaBoostClassifier(base_estimator=clf, n_estimators=n, learning_rate=lam)
ada_clf.fit(x_sy_train_top4, y_sy_train_top4)
# Predict on year 2 data and calculate error rate
y_pred_ada = ada_clf.predict(x_sy_test_top4)
error_rate = 1 - accuracy_score(y_sy_test_top4, y_pred_ada)
error_rates.append(error_rate)
# Plot the error rates
plt.plot(N, error_rates, label=clf_name)
plt.xlabel('N ')
plt.ylabel('Error rate')
plt.title('Adaboost error rate for ' + clf_name + ' with λ= ' + str(lam))
plt.legend()
# plt.show()
# Best value of N* for each base estimator for lambda = 0.5
best_Ns = {}
best_accuracy = {}
for base_estimator in classifiers:
error_rates = []
accuracy_list = []
# Iterate over the range of N values
for n in N:
ada_clf = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n, learning_rate=0.5)
ada_clf.fit(x_sy_train_top4, y_sy_train_top4)
y_pred_ada_clf = ada_clf.predict(x_sy_test_top4)
accuracy_ada_clf = accuracy_score(y_sy_test_top4, y_pred_ada_clf)
accuracy_list.append(accuracy_ada_clf)
error_rate_ada_clf = round(1 - accuracy_ada_clf, 2)
error_rates.append(error_rate_ada_clf)
# Find the index of the minimum error rate
best_N_idx = np.argmin(error_rates)
best_Ns[str(base_estimator)] = N[best_N_idx]
best_accuracy[str(base_estimator)] = accuracy_list[best_N_idx]
# Print the best N* for each base estimator
for estimator, best_N in best_Ns.items():
estimator = estimator.replace("(", " ").replace(")", " ")
# print('\nBest value of N* for',estimator,'is : ', best_N)
# print(best_accuracy)
accuracy_gnb_adab = round(best_accuracy['GaussianNB()'],2) * 100
accuracy_lr_adab = round(best_accuracy['LogisticRegression()'],2) * 100
# Accuracy for each base estimator
for estimator, best_acc in best_accuracy.items():
estimator = estimator.replace("(", " ").replace(")", " ")
print('\nAccuracy with AdaBoost classifier and ', estimator, 'is : ', round(best_acc,2))
Accuracy with AdaBoost classifier and LogisticRegression is : 0.66 Accuracy with AdaBoost classifier and GaussianNB is : 0.66
# Logistic regression model
lrm = LogisticRegression()
lrm.fit(x_sy_train_top4, y_sy_train_top4)
# Predict
y_pred_lr = lrm.predict(x_sy_test_top4)
# Logistic regression co-efficients
print("Logistic regression coefficients:")
print("b0 = ", lrm.intercept_)
print("b1 = ", lrm.coef_[0][0])
print("b2 = ", lrm.coef_[0][1])
# Accuracy and Confusion Matrix
accuracy_log_reg = round(accuracy_score(y_sy_test_top4, y_pred_lr),2) * 100
print("\nAccuracy with Logistic Regression classifier is :", accuracy_log_reg)
cm = metrics.confusion_matrix(y_sy_test_top4, y_pred_lr)
print("\nConfusion Matrix - Logistic Regression classifier : \n", cm)
Logistic regression coefficients: b0 = [-7.30865541e-06 1.24016122e-05 -5.09295677e-06] b1 = 3.0808607640375015e-05 b2 = -2.5135848094315654e-05 Accuracy with Logistic Regression classifier is : 66.0 Confusion Matrix - Logistic Regression classifier : [[ 0 1933 0] [ 10 7991 0] [ 1 2214 0]]
# Tabular demonstration of accuracy and classifiers
# Summarize the table
print('\nSummary Table with Top 4 features by importance using \nExtra Trees Classifier')
columns = ['Classifier', 'Accuracy %']
hdr = ['K-NN','SVM-Gaussian','SVM-Polynomial','SVM-Linear','Random Forest','Decision Tree',
'AdaBoost with LogisticRegression','AdaBoost with Gaussian Naive Bayes','LogisticRegression']
c1 = [round(accuracy_knn,2),accuracy_g_svm,round(accuracy_p_svm_score,2),round(accuracy_lin_svm_score,2)
,accuracy_rf,round(accuracy_dt,2),accuracy_lr_adab,accuracy_gnb_adab,accuracy_log_reg]
class_Table = PrettyTable()
class_Table.add_column(columns[0], hdr)
class_Table.add_column(columns[1], c1)
print(class_Table)
Summary Table with Top 4 features by importance using Extra Trees Classifier +------------------------------------+------------+ | Classifier | Accuracy % | +------------------------------------+------------+ | K-NN | 63.03 | | SVM-Gaussian | 65.86 | | SVM-Polynomial | 65.86 | | SVM-Linear | 46.33 | | Random Forest | 65.92 | | Decision Tree | 65.86 | | AdaBoost with LogisticRegression | 66.0 | | AdaBoost with Gaussian Naive Bayes | 66.0 | | LogisticRegression | 66.0 | +------------------------------------+------------+